mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
span.ent only returns first sentence (#7084)
* return first sentence when span contains sentence boundary * docs fix * small fixes * cleanup
This commit is contained in:
parent
30e1a89aeb
commit
709c9e75af
|
@ -61,7 +61,6 @@ def test_issue7029():
|
|||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
|
||||
nlp.select_pipes(enable=["tok2vec", "tagger"])
|
||||
docs1 = list(nlp.pipe(texts, batch_size=1))
|
||||
docs2 = list(nlp.pipe(texts, batch_size=4))
|
||||
assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
import pytest
|
||||
|
||||
from spacy.tokens.doc import Doc
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline._parser_internals.arc_eager import ArcEager
|
||||
|
|
18
spacy/tests/regression/test_issue7065.py
Normal file
18
spacy/tests/regression/test_issue7065.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from spacy.lang.en import English
|
||||
|
||||
|
||||
def test_issue7065():
    """Regression test for issue 7065.

    Builds a blank English pipeline with a sentencizer and an entity ruler
    whose single pattern matches "Symphony No. 8". The test asserts that the
    text is split into two sentences, that the matched entity straddles the
    end of the first sentence, and that `ent.sent` resolves to that first
    sentence.
    """
    nlp = English()
    nlp.add_pipe("sentencizer")
    entity_ruler = nlp.add_pipe("entity_ruler")
    entity_ruler.add_patterns(
        [
            {
                "label": "THING",
                "pattern": [
                    {"LOWER": "symphony"},
                    {"LOWER": "no"},
                    {"LOWER": "."},
                    {"LOWER": "8"},
                ],
            }
        ]
    )

    doc = nlp(
        "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."
    )
    sents = list(doc.sents)
    assert len(sents) == 2

    first_sent = sents[0]
    entity = doc.ents[0]
    # The entity must begin inside the first sentence and end beyond it.
    assert entity.start < first_sent.end < entity.end
    # Even though the entity crosses the boundary, its sentence is the first one.
    assert sents.index(entity.sent) == 0
|
|
@ -357,7 +357,12 @@ cdef class Span:
|
|||
|
||||
@property
|
||||
def sent(self):
|
||||
"""RETURNS (Span): The sentence span that the span is a part of."""
|
||||
"""Obtain the sentence that contains this span. If the given span
|
||||
crosses sentence boundaries, return only the first sentence
|
||||
to which it belongs.
|
||||
|
||||
RETURNS (Span): The sentence span that the span is a part of.
|
||||
"""
|
||||
if "sent" in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks["sent"](self)
|
||||
# Use `sent_start` token attribute to find sentence boundaries
|
||||
|
@ -367,8 +372,8 @@ cdef class Span:
|
|||
start = self.start
|
||||
while self.doc.c[start].sent_start != 1 and start > 0:
|
||||
start += -1
|
||||
# Find end of the sentence
|
||||
end = self.end
|
||||
# Find end of the sentence - can be within the entity
|
||||
end = self.start + 1
|
||||
while end < self.doc.length and self.doc.c[end].sent_start != 1:
|
||||
end += 1
|
||||
n += 1
|
||||
|
|
|
@ -219,7 +219,7 @@ alignment mode `"strict"`.
|
|||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Doc.set_ents {#ents tag="method" new="3"}
|
||||
## Doc.set_ents {#set_ents tag="method" new="3"}
|
||||
|
||||
Set the named entities in the document.
|
||||
|
||||
|
@ -633,12 +633,14 @@ not been implemented for the given language, a `NotImplementedError` is raised.
|
|||
| ---------- | ------------------------------------- |
|
||||
| **YIELDS** | Noun chunks in the document. ~~Span~~ |
|
||||
|
||||
## Doc.sents {#sents tag="property" model="parser"}
|
||||
## Doc.sents {#sents tag="property" model="sentences"}
|
||||
|
||||
Iterate over the sentences in the document. Sentence spans have no label. To
|
||||
improve accuracy on informal texts, spaCy calculates sentence boundaries from
|
||||
the syntactic dependency parse. If the parser is disabled, the `sents` iterator
|
||||
will be unavailable.
|
||||
Iterate over the sentences in the document. Sentence spans have no label.
|
||||
|
||||
This property is only available when
|
||||
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
|
||||
document by the `parser`, `senter`, `sentencizer` or some custom function. It
|
||||
will raise an error otherwise.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
|
@ -483,13 +483,40 @@ The L2 norm of the span's vector representation.
|
|||
| ----------- | --------------------------------------------------- |
|
||||
| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
|
||||
|
||||
## Span.sent {#sent tag="property" model="sentences"}
|
||||
|
||||
The sentence span that this span is a part of. This property is only available
|
||||
when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
|
||||
document by the `parser`, `senter`, `sentencizer` or some custom function. It
|
||||
will raise an error otherwise.
|
||||
|
||||
If the span happens to cross sentence boundaries, only the first sentence will
|
||||
be returned. If it is required that the sentence always includes the
|
||||
full span, the result can be adjusted as such:
|
||||
|
||||
```python
|
||||
sent = span.sent
|
||||
sent = doc[sent.start : max(sent.end, span.end)]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("Give it back! He pleaded.")
|
||||
> span = doc[1:3]
|
||||
> assert span.sent.text == "Give it back!"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------- |
|
||||
| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
| `tensor` <Tag variant="new">2.1.7</Tag> | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
|
||||
| `sent` | The sentence span that this span is a part of. ~~Span~~ |
|
||||
| `start` | The token offset for the start of the span. ~~int~~ |
|
||||
| `end` | The token offset for the end of the span. ~~int~~ |
|
||||
| `start_char` | The character offset for the start of the span. ~~int~~ |
|
||||
|
|
|
@ -585,7 +585,7 @@ print(ent_francisco) # ['Francisco', 'I', 'GPE']
|
|||
To ensure that the sequence of token annotations remains consistent, you have to
|
||||
set entity annotations **at the document level**. However, you can't write
|
||||
directly to the `token.ent_iob` or `token.ent_type` attributes, so the easiest
|
||||
way to set entities is to assign to the [`doc.ents`](/api/doc#ents) attribute
|
||||
way to set entities is to use the [`doc.set_ents`](/api/doc#set_ents) function
|
||||
and create the new entity as a [`Span`](/api/span).
|
||||
|
||||
```python
|
||||
|
|
Loading…
Reference in New Issue
Block a user