span.sent only returns first sentence (#7084)

* return first sentence when span contains sentence boundary * docs fix * small fixes * cleanup
2025-08-10 07:04:53 +03:00 · 2021-02-19 13:02:38 +01:00 · 2021-02-19 13:02:38 +01:00 · 709c9e75af
commit 709c9e75af
parent 30e1a89aeb
7 changed files with 63 additions and 14 deletions
--- a/spacy/tests/regression/test_issue7029.py
+++ b/spacy/tests/regression/test_issue7029.py
@ -61,7 +61,6 @@ def test_issue7029():
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
-    nlp.select_pipes(enable=["tok2vec", "tagger"])
    docs1 = list(nlp.pipe(texts, batch_size=1))
    docs2 = list(nlp.pipe(texts, batch_size=4))
    assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
--- a/spacy/tests/regression/test_issue7056.py
+++ b/spacy/tests/regression/test_issue7056.py
@ -1,5 +1,3 @@
-import pytest
-
 from spacy.tokens.doc import Doc
 from spacy.vocab import Vocab
 from spacy.pipeline._parser_internals.arc_eager import ArcEager
--- a/spacy/tests/regression/test_issue7065.py
+++ b/spacy/tests/regression/test_issue7065.py
@ -0,0 +1,18 @@
+from spacy.lang.en import English
+
+
+def test_issue7065():
+    text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."
+    nlp = English()
+    nlp.add_pipe("sentencizer")
+    ruler = nlp.add_pipe("entity_ruler")
+    patterns = [{"label": "THING", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}]
+    ruler.add_patterns(patterns)
+
+    doc = nlp(text)
+    sentences = [s for s in doc.sents]
+    assert len(sentences) == 2
+    sent0 = sentences[0]
+    ent = doc.ents[0]
+    assert ent.start < sent0.end < ent.end
+    assert sentences.index(ent.sent) == 0
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -357,7 +357,12 @@ cdef class Span:

    @property
    def sent(self):
-        """RETURNS (Span): The sentence span that the span is a part of."""
+        """Obtain the sentence that contains this span. If the given span
+        crosses sentence boundaries, return only the first sentence
+        to which it belongs.
+
+        RETURNS (Span): The sentence span that the span is a part of.
+        """
        if "sent" in self.doc.user_span_hooks:
            return self.doc.user_span_hooks["sent"](self)
        # Use `sent_start` token attribute to find sentence boundaries
@ -367,8 +372,8 @@ cdef class Span:
            start = self.start
            while self.doc.c[start].sent_start != 1 and start > 0:
                start += -1
-            # Find end of the sentence
-            end = self.end
+            # Find end of the sentence - can be within the entity
+            end = self.start + 1
            while end < self.doc.length and self.doc.c[end].sent_start != 1:
                end += 1
                n += 1
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -219,7 +219,7 @@ alignment mode `"strict"`.
 | `alignment_mode`                     | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | **RETURNS**                          | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

-## Doc.set_ents {#ents tag="method" new="3"}
+## Doc.set_ents {#set_ents tag="method" new="3"}

 Set the named entities in the document.

@ -633,12 +633,14 @@ not been implemented for the given language, a `NotImplementedError` is raised.
 | ---------- | ------------------------------------- |
 | **YIELDS** | Noun chunks in the document. ~~Span~~ |

-## Doc.sents {#sents tag="property" model="parser"}
+## Doc.sents {#sents tag="property" model="sentences"}

-Iterate over the sentences in the document. Sentence spans have no label. To
-improve accuracy on informal texts, spaCy calculates sentence boundaries from
-the syntactic dependency parse. If the parser is disabled, the `sents` iterator
-will be unavailable.
+Iterate over the sentences in the document. Sentence spans have no label.
+
+This property is only available when
+[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
+document by the `parser`, `senter`, `sentencizer` or some custom function. It
+will raise an error otherwise.

 > #### Example
 >
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@ -483,13 +483,40 @@ The L2 norm of the span's vector representation.
 | ----------- | --------------------------------------------------- |
 | **RETURNS** | The L2 norm of the vector representation. ~~float~~ |

+## Span.sent {#sent tag="property" model="sentences"}
+
+The sentence span that this span is a part of. This property is only available
+when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
+document by the `parser`, `senter`, `sentencizer` or some custom function. It
+will raise an error otherwise.
+
+If the span happens to cross sentence boundaries, only the first sentence will
+be returned. If the sentence must always cover the full span, the result can be
+adjusted as follows:
+
+```python
+sent = span.sent
+sent = doc[sent.start : max(sent.end, span.end)]
+```
+
+> #### Example
+>
+> ```python
+> doc = nlp("Give it back! He pleaded.")
+> span = doc[1:3]
+> assert span.sent.text == "Give it back!"
+> ```
+
+| Name        | Description                                             |
+| ----------- | ------------------------------------------------------- |
+| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
+
 ## Attributes {#attributes}

 | Name                                    | Description                                                                                                                   |
 | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `doc`                                   | The parent document. ~~Doc~~                                                                                                  |
 | `tensor` <Tag variant="new">2.1.7</Tag> | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                              |
-| `sent`                                  | The sentence span that this span is a part of. ~~Span~~                                                                       |
 | `start`                                 | The token offset for the start of the span. ~~int~~                                                                           |
 | `end`                                   | The token offset for the end of the span. ~~int~~                                                                             |
 | `start_char`                            | The character offset for the start of the span. ~~int~~                                                                       |
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@ -585,7 +585,7 @@ print(ent_francisco)  # ['Francisco', 'I', 'GPE']
 To ensure that the sequence of token annotations remains consistent, you have to
 set entity annotations **at the document level**. However, you can't write
 directly to the `token.ent_iob` or `token.ent_type` attributes, so the easiest
-way to set entities is to assign to the [`doc.ents`](/api/doc#ents) attribute
+way to set entities is to use the [`doc.set_ents`](/api/doc#set_ents) function
 and create the new entity as a [`Span`](/api/span).

 ```python