mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Added sents property to Span for Spans spanning over several sentences (#9699)
* Added sents property to Span class that returns a generator of sentences the Span belongs to * Added description to Span.sents property * Update test_span to clarify the difference between span.sent and span.sents Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/doc/test_span.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix documentation typos in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update Span.sents doc string in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Parametrized test_span_spans * Corrected Span.sents to check for span-level hook first. Also, made Span.sent respect doc-level sents hook if no span-level hook is provided * Corrected Span documentation copy/paste issue * Put back accidentally deleted lines * Fixed formatting in span.pyx * Moved check for SENT_START annotation after user hooks in Span.sents * add version where the property was introduced Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
7d50804644
commit
472740d613
|
@ -200,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
||||||
assert doc[:2].sent.root.text == "is"
|
assert doc[:2].sent.root.text == "is"
|
||||||
assert doc[:2].sent.text == "This is a sentence."
|
assert doc[:2].sent.text == "This is a sentence."
|
||||||
assert doc[6:7].sent.root.left_edge.text == "This"
|
assert doc[6:7].sent.root.left_edge.text == "This"
|
||||||
|
assert doc[0 : len(doc)].sent == list(doc.sents)[0]
|
||||||
|
assert list(doc[0 : len(doc)].sents) == list(doc.sents)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
doc_not_parsed[:2].sent
|
||||||
|
|
||||||
# test on manual sbd
|
# test on manual sbd
|
||||||
doc_not_parsed[0].is_sent_start = True
|
doc_not_parsed[0].is_sent_start = True
|
||||||
doc_not_parsed[5].is_sent_start = True
|
doc_not_parsed[5].is_sent_start = True
|
||||||
|
@ -207,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
||||||
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
|
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "start,end,expected_sentence",
    [
        (0, 14, "This is"),  # whole document
        (1, 4, "This is"),  # overlaps two sentences
        (0, 2, "This is"),  # doc start, complete sentence
        (0, 1, "This is"),  # doc start, part of a sentence
        (10, 14, "And a"),  # doc end, overlaps two sentences
        (12, 14, "third."),  # doc end, complete sentence
        (1, 1, "This is"),  # zero-length span
    ],
)
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
    """Span.sent consults the doc-level "sents" hook, but a span-level
    "sent" hook, when registered, takes precedence over it."""

    def doc_sents_hook(doc):
        # Chop the doc into fixed-size two-token "sentences".
        return [doc[pos : pos + 2] for pos in range(0, len(doc), 2)]

    # Install only the doc-level hook first.
    doc.user_hooks["sents"] = doc_sents_hook

    # Span.sent should be derived from the doc-level sents hook.
    assert doc[start:end].sent.text == expected_sentence

    # Registering a span-level "sent" hook overrides the doc-level one:
    # the identity hook makes every span its own sentence.
    doc.user_span_hooks["sent"] = lambda span: span
    assert doc[start:end].sent == doc[start:end]
|
||||||
|
|
||||||
|
|
||||||
def test_spans_lca_matrix(en_tokenizer):
|
def test_spans_lca_matrix(en_tokenizer):
|
||||||
"""Test span's lca matrix generation"""
|
"""Test span's lca matrix generation"""
|
||||||
tokens = en_tokenizer("the lazy dog slept")
|
tokens = en_tokenizer("the lazy dog slept")
|
||||||
|
@ -536,3 +571,38 @@ def test_span_with_vectors(doc):
|
||||||
# single-token span with vector
|
# single-token span with vector
|
||||||
assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
|
assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
|
||||||
doc.vocab.vectors = prev_vectors
|
doc.vocab.vectors = prev_vectors
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "start,end,expected_sentences,expected_sentences_with_hook",
    [
        (0, 14, 3, 7),  # whole document
        (3, 6, 2, 2),  # overlaps two sentences
        (0, 4, 1, 2),  # doc start, complete sentence
        (0, 3, 1, 2),  # doc start, part of a sentence
        (9, 14, 2, 3),  # doc end, overlaps two sentences
        (10, 14, 1, 2),  # doc end, complete sentence
        (11, 14, 1, 2),  # doc end, partial sentence
        (0, 0, 1, 1),  # zero-length span
    ],
)
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
    """Span.sents yields every sentence the span overlaps, honoring
    span-level over doc-level user hooks."""
    span = doc[start:end]

    # Without hooks, sentence boundaries come from the parse.
    assert len(list(span.sents)) == expected_sentences

    def doc_sents_hook(doc):
        # Chop the doc into fixed-size two-token "sentences".
        return [doc[pos : pos + 2] for pos in range(0, len(doc), 2)]

    # The doc-level "sents" hook changes which sentences are found.
    doc.user_hooks["sents"] = doc_sents_hook
    assert len(list(doc[start:end].sents)) == expected_sentences_with_hook

    # A span-level "sents" hook takes precedence over the doc-level one;
    # this one reports the span itself as its only sentence.
    doc.user_span_hooks["sents"] = lambda span: [span]
    hooked = list(doc[start:end].sents)
    assert hooked[0] == doc[start:end]
    assert len(hooked) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_sents_not_parsed(doc_not_parsed):
    """Iterating Span.sents with no sentence boundaries set raises ValueError."""
    span = Span(doc_not_parsed, 0, 3)
    with pytest.raises(ValueError):
        list(span.sents)
|
||||||
|
|
|
@ -404,6 +404,10 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
if "sent" in self.doc.user_span_hooks:
|
if "sent" in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks["sent"](self)
|
return self.doc.user_span_hooks["sent"](self)
|
||||||
|
elif "sents" in self.doc.user_hooks:
|
||||||
|
for sentence in self.doc.user_hooks["sents"](self.doc):
|
||||||
|
if sentence.start <= self.start < sentence.end:
|
||||||
|
return sentence
|
||||||
# Use `sent_start` token attribute to find sentence boundaries
|
# Use `sent_start` token attribute to find sentence boundaries
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
if self.doc.has_annotation("SENT_START"):
|
if self.doc.has_annotation("SENT_START"):
|
||||||
|
@ -422,6 +426,47 @@ cdef class Span:
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E030)
|
raise ValueError(Errors.E030)
|
||||||
|
|
||||||
|
@property
def sents(self):
    """Obtain the sentences that contain this span. If the given span
    crosses sentence boundaries, return all sentences it is a part of.

    RETURNS (Iterable[Span]): All sentences that the span is a part of.

    DOCS: https://spacy.io/api/span#sents
    """
    cdef int start
    cdef int i

    # Precedence: span-level "sents" hook, then doc-level "sents" hook,
    # then the SENT_START token annotation set by a pipeline component.
    if "sents" in self.doc.user_span_hooks:
        yield from self.doc.user_span_hooks["sents"](self)
    elif "sents" in self.doc.user_hooks:
        for sentence in self.doc.user_hooks["sents"](self.doc):
            # Skip sentences that end at or before the span start; yield the
            # overlapping ones (the equality clause admits an empty span that
            # sits exactly on a sentence start).
            if sentence.end > self.start:
                if sentence.start < self.end or sentence.start == self.start == self.end:
                    yield sentence
                else:
                    # NOTE(review): the early break assumes the hook yields
                    # sentences in document order — TODO confirm.
                    break
    else:
        if not self.doc.has_annotation("SENT_START"):
            raise ValueError(Errors.E030)
        # Use `sent_start` token attribute to find sentence boundaries
        # Find start of the 1st sentence of the Span
        start = self.start
        while self.doc.c[start].sent_start != 1 and start > 0:
            start -= 1

        # Now, find all the sentences in the span
        # Each token with sent_start == 1 closes the previous sentence;
        # `start` is advanced to the new sentence's first token.
        for i in range(start + 1, self.doc.length):
            if self.doc.c[i].sent_start == 1:
                yield Span(self.doc, start, i)
                start = i
                if start >= self.end:
                    break
        # Emit the trailing sentence that runs to the span end (or the
        # document end) if the loop did not already pass it.
        if start < self.end:
            yield Span(self.doc, start, self.end)
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ents(self):
|
def ents(self):
|
||||||
"""The named entities in the span. Returns a tuple of named entity
|
"""The named entities in the span. Returns a tuple of named entity
|
||||||
|
|
|
@ -518,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)]
|
||||||
| ----------- | ------------------------------------------------------- |
|
| ----------- | ------------------------------------------------------- |
|
||||||
| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
|
| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
|
||||||
|
|
||||||
|
## Span.sents {#sents tag="property" model="sentences" new="3.2.1"}
|
||||||
|
|
||||||
|
Returns a generator over the sentences the span belongs to. This property is only available
|
||||||
|
when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
|
||||||
|
document by the `parser`, `senter`, `sentencizer` or some custom function. It
|
||||||
|
will raise an error otherwise.
|
||||||
|
|
||||||
|
If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Give it back! He pleaded.")
|
||||||
|
> span = doc[2:4]
|
||||||
|
> assert len(span.sents) == 2
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------------------------------- |
|
||||||
|
| **RETURNS** | A generator yielding sentences this `Span` is a part of. ~~Iterable[Span]~~ |
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user