diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index cee48522d..8435b32e1 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -61,7 +61,6 @@ def test_issue7029(): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] - nlp.select_pipes(enable=["tok2vec", "tagger"]) docs1 = list(nlp.pipe(texts, batch_size=1)) docs2 = list(nlp.pipe(texts, batch_size=4)) assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] diff --git a/spacy/tests/regression/test_issue7056.py b/spacy/tests/regression/test_issue7056.py index 64a420b84..541144877 100644 --- a/spacy/tests/regression/test_issue7056.py +++ b/spacy/tests/regression/test_issue7056.py @@ -1,5 +1,3 @@ -import pytest - from spacy.tokens.doc import Doc from spacy.vocab import Vocab from spacy.pipeline._parser_internals.arc_eager import ArcEager diff --git a/spacy/tests/regression/test_issue7065.py b/spacy/tests/regression/test_issue7065.py new file mode 100644 index 000000000..897687d19 --- /dev/null +++ b/spacy/tests/regression/test_issue7065.py @@ -0,0 +1,18 @@ +from spacy.lang.en import English + + +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [{"label": "THING", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 31eb1385b..06d86d2ac 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -357,7 +357,12 @@ cdef class Span: @property def sent(self): - """RETURNS (Span): The sentence span that the span is a part of.""" + """Obtain the sentence that contains this span. If the given span + crosses sentence boundaries, return only the first sentence + to which it belongs. + + RETURNS (Span): The sentence span that the span is a part of. + """ if "sent" in self.doc.user_span_hooks: return self.doc.user_span_hooks["sent"](self) # Use `sent_start` token attribute to find sentence boundaries @@ -367,8 +372,8 @@ cdef class Span: start = self.start while self.doc.c[start].sent_start != 1 and start > 0: start += -1 - # Find end of the sentence - end = self.end + # Find end of the sentence - can be within the entity + end = self.start + 1 while end < self.doc.length and self.doc.c[end].sent_start != 1: end += 1 n += 1 diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index a0b4c29bb..e4d24d2c0 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -219,7 +219,7 @@ alignment mode `"strict". | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | -## Doc.set_ents {#ents tag="method" new="3"} +## Doc.set_ents {#set_ents tag="method" new="3"} Set the named entities in the document. @@ -633,12 +633,14 @@ not been implemeted for the given language, a `NotImplementedError` is raised. | ---------- | ------------------------------------- | | **YIELDS** | Noun chunks in the document. ~~Span~~ | -## Doc.sents {#sents tag="property" model="parser"} +## Doc.sents {#sents tag="property" model="sentences"} -Iterate over the sentences in the document. Sentence spans have no label. To -improve accuracy on informal texts, spaCy calculates sentence boundaries from -the syntactic dependency parse. If the parser is disabled, the `sents` iterator -will be unavailable. +Iterate over the sentences in the document. Sentence spans have no label. + +This property is only available when +[sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. It +will raise an error otherwise. > #### Example > diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 37d18c62e..333344b31 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -483,13 +483,40 @@ The L2 norm of the span's vector representation. | ----------- | --------------------------------------------------- | | **RETURNS** | The L2 norm of the vector representation. ~~float~~ | +## Span.sent {#sent tag="property" model="sentences"} + +The sentence span that this span is a part of. This property is only available +when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. It +will raise an error otherwise. + +If the span happens to cross sentence boundaries, only the first sentence will +be returned. If it is required that the sentence always includes the +full span, the result can be adjusted as such: + +```python +sent = span.sent +sent = doc[sent.start : max(sent.end, span.end)] +``` + +> #### Example +> +> ```python +> doc = nlp("Give it back! He pleaded.") +> span = doc[1:3] +> assert span.sent.text == "Give it back!" +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------- | +| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ | + ## Attributes {#attributes} | Name | Description | | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | `doc` | The parent document. ~~Doc~~ | | `tensor` 2.1.7 | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | -| `sent` | The sentence span that this span is a part of. ~~Span~~ | | `start` | The token offset for the start of the span. ~~int~~ | | `end` | The token offset for the end of the span. ~~int~~ | | `start_char` | The character offset for the start of the span. ~~int~~ | diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 80a8eab1b..fd76c6e4d 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -585,7 +585,7 @@ print(ent_francisco) # ['Francisco', 'I', 'GPE'] To ensure that the sequence of token annotations remains consistent, you have to set entity annotations **at the document level**. However, you can't write directly to the `token.ent_iob` or `token.ent_type` attributes, so the easiest -way to set entities is to assign to the [`doc.ents`](/api/doc#ents) attribute +way to set entities is to use the [`doc.set_ents`](/api/doc#set_ents) function and create the new entity as a [`Span`](/api/span). ```python